clear
set more off
macro drop all
capture log close

/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection
Clean Case Data

Created on: 11/28/17

Last Modified on: 2/20/2024

Description: This file cleans the raw case data.

Note that we have removed the file directory names from this program for 
confidentiality reasons.
********************************************************************************/

** Setting the Directory
global rawdata 
global cleandata 
global tmp 

/********************************************************************************

The raw case data should be unique at the victim*case level although 
there are some duplicates. 

There are 2 sections of this do file:
1) Clean Variables
2) Collapse to Child*Case level

*******************************************************************************/

*************************
**1) CLEAN VARIABLES
*************************

**Load raw allegations data
use "${rawdata}case.dta", clear

**Drop children who were not the allegation victim (these are children who might
**have been interviewed as part of the investigation process but were not the 
**subject of the investigation themselves).
drop if child_role!="Alleged Victim (AV)"

**Rename and label variables
rename intakechildvicpartyid vicid
la var vicid "Child Victim ID"

la var intake_id "Intake Case ID"

rename investigation_caseid inv_caseid
la var inv_caseid "Investigation Case ID"

rename srcinvestcase src_caseid
la var src_caseid "SRC Case ID"

rename ongoing_caseid ong_caseid
la var ong_caseid "Ongoing Case ID"

gen categ=.
replace categ=1 if catdesc==1
replace categ=2 if catdesc==2
replace categ=3 if catdesc==3
replace categ=4 if catdesc==4
replace categ=5 if catdesc==5
la var categ "Investigation Severity Category"
drop catdesc

gen complaintdate=date(complaint_date, "YMD")
format complaintdate %d
la var complaintdate "Date of Complaint"
drop complaint_date
rename complaintdate complaint_date

la var complaint_year "Year of Complaint"

rename childzip5 zipcode_vic
la var zipcode_vic "Victim's Zipcode"

rename perpzip5 zipcode_perp
la var zipcode_perp "Perpetrator's Zipcode"

drop childzip4 perpzip4

duplicates drop

*************************
**2) COLLAPSE TO CHILD*CASE LEVEL
*************************

duplicates report vicid inv_caseid

/*
--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |      1215659             0
        2 |       357728        178864
        3 |        24600         16400
        4 |         4732          3549
        5 |          245           196
        6 |          312           260
        8 |            8             7
--------------------------------------

*/

duplicates tag vicid inv_caseid, gen(dups)

**The main reason for duplicates is a conflict in the perpetrator's zipcode. I'll
**use the modal value if one exists and set to missing otherwise.
sort vicid inv_caseid zipcode_perp
bysort vicid inv_caseid: gen zipcode_perp_flag=1 if zipcode_perp[1]!=zipcode_perp[_N] & dups>0
bysort vicid inv_caseid: egen zipcode_perp_mode=mode(zipcode_perp) if dups>0
replace zipcode_perp=zipcode_perp_mode if zipcode_perp_flag==1 
drop zipcode_perp_flag zipcode_perp_mode dups
duplicates drop

duplicates report vicid inv_caseid
/*

--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |      1399096             0
        2 |         9508          4754
        3 |          432           288
        4 |           56            42
--------------------------------------

*/
duplicates tag vicid inv_caseid, gen(dups)

**There are still some duplicates, though. The reason for these are different complaint
**dates and different intake case id's. For obs with different complaint dates, always
**keep the earlier date.  For obs with different intake id's, save both since I'll
**need the intake id to merge on the foster care data.  

*****(a) Different complaint dates
sort vicid inv_caseid complaint_date
bysort vicid inv_caseid: gen date_flag=1 if complaint_date[1]!=complaint_date[_N] & dups>0
bysort vicid inv_caseid: egen complaint_date_min=min(complaint_date) if date_flag==1
replace complaint_date=complaint_date_min if date_flag==1
drop date_flag complaint_date_min 
**update complaint year to match change
drop complaint_year
gen complaint_year=year(complaint_date)
la var complaint_year "Year of Complaint"

*****(b) Different intake id's
sort vicid inv_caseid intake_id
bysort vicid inv_caseid: gen intake_flag=1 if intake_id[1]!=intake_id[_N]
bysort vicid inv_case: egen intake_id_max=max(intake_id) if intake_flag==1
gen intake_id2_tmp=intake_id if intake_flag==1 & intake_id!=intake_id_max
bysort vicid inv_caseid: egen intake_id2=max(intake_id2_tmp)
la var intake_id2 "Alternate Intake ID"
replace intake_id=intake_id_max if intake_flag==1
drop intake_id_max intake_flag intake_id2_tmp dups

duplicates drop

duplicates report vicid inv_caseid
/*

--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |      1403919             0
        2 |          178            89
--------------------------------------

*/
duplicates tag vicid inv_caseid, gen(dups)


**Still some duplicates to deal with! These are because of a discrepancy in the 
**child's zipcode. Set to conflicting cases to missing 
sort vicid inv_caseid zipcode_vic
bysort vicid inv_caseid: gen zipcode_vic_flag=1 if zipcode_vic[1]!=zipcode_vic[_N] & dups>0
bysort vicid inv_caseid: egen zipcode_vic_mode=mode(zipcode_vic) if dups>0
replace zipcode_vic=zipcode_vic_mode if zipcode_vic_flag==1 
drop zipcode_vic_flag zipcode_vic_mode dups
duplicates drop

duplicates report vicid inv_caseid
/*
--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |      1404008             0
--------------------------------------

*/

compress
sort vicid inv_caseid
order vicid inv_caseid complaint* 
save "${cleandata}case_clean.dta", replace


















